In [2]:
import pandas as pd
import numpy as np
import datetime as dt

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import pycountry
import re
import plotly.express as px

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import matplotlib
from matplotlib.pyplot import figure
%matplotlib inline
In [3]:
post_df=pd.read_csv('data_posts.csv')
In [4]:
#convert created date to normal datetime
post_df['created_date']=post_df['post_created_utc'].apply(lambda x:dt.datetime.fromtimestamp(x))
post_df['created_year']=post_df['created_date'].dt.year
In [5]:
post_df
Out[5]:
post_id post_title post_text post_url post_score post_num_comments post_upvote_ratio post_created_utc created_date created_year
0 75jx65 My brother, Adeolu Ogunniyi (24) has been miss... NaN https://imgur.com/a/S4AlS 34248 371 0.91 1.507667e+09 2017-10-11 03:30:36 2017
1 9qyu4g How to shake someone at a hostel? I’m travelling solo for the first time in Japa... https://www.reddit.com/r/solotravel/comments/9... 8296 138 0.95 1.540381e+09 2018-10-24 18:37:13 2018
2 5m30z6 I told myself I would go solo travel through E... NaN https://i.reddituploads.com/11b13e9b9d2b417688... 7178 198 0.92 1.483576e+09 2017-01-05 07:29:54 2017
3 16c1of1 The number of old sex tourists in Bangkok is i... I am currently in Bangkok and the number of se... https://www.reddit.com/r/solotravel/comments/1... 5436 624 0.82 1.694048e+09 2023-09-07 07:50:58 2023
4 fj4v2p For those of you still travelling Europe despi... I've seen about a million threads on this toda... https://www.reddit.com/r/solotravel/comments/f... 4777 544 0.95 1.584293e+09 2020-03-16 00:25:20 2020
... ... ... ... ... ... ... ... ... ... ...
995 o3kul3 Is it a bad idea to quit a secure job to trave... I've been at my job for quite a few years now,... https://www.reddit.com/r/solotravel/comments/o... 364 231 0.94 1.624123e+09 2021-06-20 00:14:40 2021
996 mijvzb How important is it to you to meet people whil... I know everyone will have a different opinion ... https://www.reddit.com/r/solotravel/comments/m... 368 159 0.95 1.617368e+09 2021-04-02 20:01:11 2021
997 lwta89 Sites I check before travelling alone during t... Hi guys. I’ve travelled a few times (during th... https://www.reddit.com/r/solotravel/comments/l... 368 110 0.87 1.614776e+09 2021-03-03 19:58:47 2021
998 jbnzw8 In which country you felt people were more fri... Got some friends who were in Iran, and they ab... https://www.reddit.com/r/solotravel/comments/j... 360 538 0.95 1.602770e+09 2020-10-15 20:55:56 2020
999 exmw4q What's the number one thing you look for when ... Personally... A curtain for the beds. Severely... https://www.reddit.com/r/solotravel/comments/e... 365 229 0.97 1.580641e+09 2020-02-02 18:00:01 2020

1000 rows × 10 columns

In [8]:
#create text
text="".join(str(p) for p in post_df['post_title'])
#generate word cloud:
def generate_wordcloud(new_text):
    #Create stopwords:
    stopwords=set(STOPWORDS)
    stopwords.update(["https", "travel","I", "Edit","EDIT","etc","imgur","html","Hostel"
                      "Update","anyone","want"])
   
    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,width=800,height=800,min_font_size=10,
    background_color="white",colormap="Set2",collocation_threshold =3).generate(text)
    fig = plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)
    
    plt.show()

    return fig

text_cloud = generate_wordcloud(text)
   
    
In [9]:
from ipywidgets import interact

def wordcloud_year(selected_year):
    df_data=post_df[post_df['created_year']==selected_year]
    text=''.join(str(p) for p in df_data['post_title'])
    #Create stopwords:
    stopwords=set(STOPWORDS)
   
    # Generate a word cloud image
    wordcloud = WordCloud(stopwords=stopwords,width=500,height=500,min_font_size=10,
    background_color="white",colormap="Set2",collocation_threshold=3).generate(text)
    fig=plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.tight_layout(pad=0)

    plt.show()
    
interact(wordcloud_year,selected_year=post_df['created_year'].unique())
interactive(children=(Dropdown(description='selected_year', options=(2017, 2018, 2023, 2020, 2019, 2016, 2022,…
Out[9]:
<function __main__.wordcloud_year(selected_year)>

Map the mentioned destinations in post texts.¶

In [10]:
# Extract destinations in post texts.
def find_destination(text_col):
    countries=[]
    text = ''.join(str(p) for p in text_col)
    for country in pycountry.countries:
        if country.name in text:
            countries.append(country.name)
    return countries
country_list=find_destination(post_df['post_text'])
In [11]:
map_data={'country':country_list}
map_data=pd.DataFrame(map_data)
map_data.head()
Out[11]:
country
0 Aruba
1 Afghanistan
2 Angola
3 Albania
4 Andorra
In [12]:
# Generate country code based on country name
def code_finding(country):
    try:
        code=pycountry.countries.get(name=country).alpha_3
    except:
        code=None
    return code
In [13]:
map_data['country_code']=map_data['country'].apply(lambda x:code_finding(x))
In [14]:
# map destinations for solo travelers
fig=px.scatter_geo(map_data, locations='country_code',hover_name='country')
fig.update_layout(title="Destinations choosen by solo travellers(total=1000 posts)")
fig.show()
In [15]:
post_df.loc[:,'post_text']=post_df['post_text'].fillna('')
In [16]:
#find the occurrences of keywords of solo traveler segmentation.
def find_words(search_pattern,text_col):
    count=0
    for text in text_col:
        matches=re.findall(search_pattern,text, re.IGNORECASE)
        count+=len(matches)
    return count
        

search_patterns=map_data['country']
total_count=[]
for p in search_patterns:
    res=find_words(p,post_df['post_text'])
    total_count.append(res)
print(total_count)
In [18]:
map_data['count_country']=total_count
map_data.head()
Out[18]:
country country_code count_country
0 Aruba ABW 1
1 Afghanistan AFG 7
2 Angola AGO 2
3 Albania ALB 20
4 Andorra AND 1
In [19]:
# top 10 destinations for solo travelling
top_10_countries=map_data.sort_values(by='count_country', ascending=False)[:10]
top_10_countries.reset_index()

fig = px.bar(
    top_10_countries,
    x="count_country",
    y="country",
    labels={"count_country": "Count", "country": "Country "},text_auto=True,
    orientation='h',
    template="plotly_dark"
).update_yaxes(
    categoryorder="total ascending",
)

fig.update_layout(
    title="Top 10 destinations for solo travelling",
    
)
fig

Posts with the most upvotes/comments¶

In [54]:
pd.set_option('display.max_colwidth', 500)
In [55]:
# posts with the most scores
post_scores=post_df[['post_title','post_score']].sort_values(by='post_score',ascending=False)[:10]
post_scores
Out[55]:
post_title post_score
0 My brother, Adeolu Ogunniyi (24) has been missing since September 10, 2017. He was backpacking in Central America and last seen at Laguna De Apoyo in Nicaragua. If you've seen him or heard anything PLEASE contact me. (more details in the description) 34248
1 How to shake someone at a hostel? 8296
2 I told myself I would go solo travel through Europe if I ever made it 6 months without a seizure. Today is my first day abroad :) 7178
3 The number of old sex tourists in Bangkok is insane 5436
4 For those of you still travelling Europe despite the restrictions, GO HOME. 4777
5 Solo travel means waking up early when you want. No crowds at the Treasury! 4713
6 Afraid to go back to my hostel room 4636
7 Solo travel at any age - my first time backpacking Europe (age 17) VS last week, 22 years later - Prague 3553
8 I just shat in my hostel bed, what to do next? 3488
9 You guys told me about Hostel Uppelink in Ghent, Belgium and the view that some of the rooms had, I still wasn't expecting it to be quite like this... 3465
In [56]:
#posts with the most comments
post_comments=post_df[['post_title','post_num_comments']].sort_values(by='post_num_comments',ascending=False)[:10]
post_comments
Out[56]:
post_title post_num_comments
621 What is a popular traveling spot that seems unappealing to you? 1249
436 What's a country you'd love to visit, but can't/won't? (non-pandemic reasons) 995
279 Top three favorite cities in the world? 989
426 What's one city that exceeded your expectations and one that left you a bit disappointed? 962
255 What is the worst poverty you have come across on your travels? 949
808 What city/place did you NOT feel safe in? 868
458 What are the tourist traps in your city that should be avoided? 863
553 Places you have visited and would not return? 846
10 An unfortunate reminder for other young female solo travelers 775
343 Rant about how Coronavirus ruined your trip 764
In [22]:
# Number of posts over the years
post_by_year=post_df[['post_id','created_year']].groupby('created_year').size().reset_index(name='count')


# plot the line graph for numbers of posts over the years

fig=px.line(post_by_year,x='created_year',y='count',title='Number of posts through the time(total posts=1000)')
fig.show()
In [23]:
# Relationship of upvote number and comment number
fig=px.scatter(post_df,x='post_score',y='post_num_comments',title='Upvote number and comment number(total=1000 posts)')
fig.show()
In [25]:
# Relationship of upvote ratio and comment number
fig=px.scatter(post_df,x='post_upvote_ratio',y='post_num_comments',title='Upvote ratio and comment number(total=1000 posts)')
fig.show()
In [58]:
from IPython.core.interactiveshell import InteractiveShell  
InteractiveShell.ast_node_interactivity = "all"
# mean of upvote ratio
post_df['post_upvote_ratio'].mean()

#Top largest upvote ratio
post_df['post_upvote_ratio'].nlargest(10)

#Top smallest upvote ratio
post_df['post_upvote_ratio'].nsmallest(10)
Out[58]:
0.9476600000000102
Out[58]:
15    0.99
19    0.99
20    0.99
23    0.99
32    0.99
33    0.99
44    0.99
45    0.99
59    0.99
63    0.99
Name: post_upvote_ratio, dtype: float64
Out[58]:
268    0.73
421    0.73
665    0.75
677    0.76
156    0.78
694    0.78
609    0.79
528    0.80
758    0.80
524    0.81
Name: post_upvote_ratio, dtype: float64
In [ ]: